## Ryan Copus and Ryan Hübert
## Measuring How Much Judges Matter for Case Outcomes
## Journal of Law and Courts

# Note to reader: please review the README for important information about 
# software and other requirements

################################################################################
# SCRIPT-SPECIFIC USER-SPECIFIED VARIABLES
################################################################################

# Do you want to load partial results (if available) and resume estimation?
# If so, please set `resume <- TRUE` below
# Note: the exists() test is checking whether this script is being run passively 
#       via the 0.FullReplication.R script, which sets its own toggle for resume
if(!exists("resume")) {
  # Clear workspace (comment out if you do not want to do this for some reason)
  rm(list=ls())
  # Do you want to load partial results (if available) and resume estimation?
  resume <- FALSE # if FALSE then any existing results will be over-written
  # How much memory of your computer do you want to use to run h2o models?
  mem <- 64 # this is measured in GB
  # How many cores of your computer do you want to use to run h2o models?
  ncores <- parallel::detectCores() # this will use use all cores
} else {
  # If this script is being executed passively via the 0.FullReplication.R script
  message(paste0("Note: this script will ", 
                 ifelse(resume, "resume partially completed estimation.", 
                        "run the entire script from scratch.")))
  Sys.sleep(5)
}

################################################################################
# LOAD UTILITIES, PACKAGES AND DATA
################################################################################

# Load utils and user-supplied variables
source(here::here("Code/_config.R"))
source(here::here("Code/_utils.R"))

# Load libraries 
library("h2o") # used to run machine learning models
library("xgboost") # Some machines cannot run XGBoost in h2o & will need to use `xgboost` package

# Load dataset and lists of predictors 
load("Data/9C_DATA.rdata")
load("Data/PREDICTORS.rdata")

################################################################################
# ESTIMATE BASE MODEL (predicting panel type with only fixed effects)
################################################################################

# Set the outcome we will use for the base model
# Since this is a randomization check, we will try to predict a treatment variable
y <- "p.maj.rep"

# Set the number of "internal" folds for cross-validation inside h2o algorithms
nfolds <- 10

# Set the "external" folds
set.seed(845) # set seed to ensure same folds every time script is executed
df$fold <- sample(rep_len(1:10, nrow(df))) # we have 10 external folds

# Create a temporary dataset for prediction
jf <- df[, c("row.id", y, fes, app.predictors, dist.predictors, "fold")]

# h2o requires response variable to be a factor variable
jf[[y]] <- factor(jf[[y]])

# Data that we will collect below by replacing NAs
pf <- as.data.frame(df[,c("row.id", "fold")])
pf$rando.base <- NA 
pf$rando.full <- NA
rm(df)

# If resume==TRUE, check if there are partial results and load them
new.file <- "Data/randomization.rdata"
if(resume){
  if(!file.exists(new.file)){
    stop(paste0("The data file ", new.file, " does not exist! You must set resume=FALSE or check your working directory is correct."))
  }
  load(new.file) 
}

################################################################################
# FIT MODELS FOR BASE MODEL
################################################################################

# Keep track of any errors that arise during the estimation
error.counter <- 0

# Keep track of warnings raised in each fold:
fold.warnings <- c()

# Keep any object we've assigned so far
keep.obj <- c(ls(), "keep.obj", "rando.model", "x")

# Iterate over the folds as long as there are missing values
for(rando.model in c("rando.base", "rando.full")){
  
  if(rando.model=="rando.base"){
    x <- fes
  } else {
    x <- c(fes, app.predictors, dist.predictors)
  }
  
  while(length(GetMissingFolds(pf, rando.model, any.NA = FALSE)) > 0 & error.counter <= 5){
    
    tryCatch({
      
      # Randomly select a fold to run
      fold <- as.numeric(names(sample(GetMissingFolds(pf, rando.model, any.NA = FALSE))[1]))
      
      # Connect to the h2o server
      h2o.init(max_mem_size = paste0(mem,"g"))
      
      # Check if h2o's XGBoost algorithm is available on this machine
      xgb.test <- h2o.xgboost.available()
      
      message("\n\nNow beginning estimation for fold ", fold, " of the ", gsub("rando.","",rando.model)," model...")
      
      # Set a specific feed for each fold
      set.seed(65 + fold) # this is probably not needed, but we set just in case
      
      # Split dataset into training and hold out sets
      dfp <- jf[jf$fold != fold, ] # training
      dfe <- jf[jf$fold == fold, ] # hold out
      
      # Convert dataframes to h2o objects
      hdfp <- as.h2o(dfp)
      hdfe <- as.h2o(dfe)
      
      message("  >> fitting first GBM...")
      my_gbm2 <- h2o.gbm(x = x, 
                         y = y,
                         training_frame = hdfp,
                         model_id = paste0("gbm2", fold, "court"),
                         nfolds = nfolds,
                         fold_assignment = "Modulo",
                         keep_cross_validation_predictions = TRUE,
                         distribution = "bernoulli",
                         ntrees = 1000,
                         max_depth = 2,
                         min_rows = 2,
                         learn_rate = 0.1,
                         seed = 1, score_tree_interval = 10,
                         col_sample_rate = 0.8, sample_rate = 0.8)
      
      message("  >> fitting second GBM...")
      my_gbm3 <- h2o.gbm(x = x, 
                         y = y,
                         training_frame = hdfp,
                         model_id = paste0("gbm3", fold, "court"),
                         nfolds = nfolds,
                         fold_assignment = "Modulo",
                         keep_cross_validation_predictions = TRUE,
                         distribution = "bernoulli",
                         ntrees = 1000,
                         max_depth = 3,
                         min_rows = 2,
                         learn_rate = 0.1,
                         seed = 1)
      
      message("  >> fitting LASSO regression...")
      my_lasso <- glmWrapper(paste0("lasso", fold, "court"), x, y, hdfp, nfolds, alpha.val = 1, family = "binomial")
      
      message("  >> fitting Ridge regression...")
      my_ridge <- glmWrapper(paste0("ridge", fold, "court"), x, y, hdfp, nfolds, alpha.val = 0, family = "binomial")
      
      message("  >> fitting a Random Forest...")
      my_rf <- h2o.randomForest(x = x,
                                y = y,
                                training_frame = hdfp,
                                model_id = paste0("rf", fold, "court"),
                                nfolds = nfolds,
                                fold_assignment = "Modulo",
                                keep_cross_validation_predictions = TRUE,
                                binomial_double_trees = TRUE,
                                ntrees = 300, # Changed from 1000 since very slow
                                min_rows = 2,
                                seed = 1)
      
      if(xgb.test){
        ## XGBoost is available in your h2o installation, so run it
        
        message("  >> fitting an XGBoost (using h2o)...")
        my_xgb <- h2o.xgboost(x = x,
                              y = y,
                              training_frame = hdfp,
                              model_id = paste0("xgb", fold, "court"),
                              nfolds = nfolds,
                              fold_assignment = "Modulo",
                              keep_cross_validation_predictions = TRUE,
                              ntrees = 1000,
                              max_depth = 4,
                              learn_rate = 0.01, # from SL.xgboost (shrinkage)
                              seed = 1)
        
        message("  >> fitting an stacked ensemble...")
        
        # Set list of base models
        bm <- list(my_gbm2,
                   my_gbm3,
                   my_lasso,
                   my_ridge,
                   my_rf,
                   my_xgb)
        
        temp_ens <- h2o.stackedEnsemble(x = x,
                                        y = y,
                                        training_frame = hdfp[, c(x,y)],
                                        model_id = paste0("FE_ensemble_", fold),
                                        metalearner_algorithm = "AUTO", 
                                        metalearner_nfolds = 10,
                                        base_models = bm,
                                        seed = 1)
        
        # In hold out set: predict probability of treatment assignment
        dfe_preds <- h2o.predict(temp_ens, newdata = hdfe)[3]
        dfe_preds <- as.data.frame(dfe_preds)[, 1]
        pf[[rando.model]][pf$fold == fold] <- dfe_preds 
        
      } else {
        
        ## XGBoost is not available in your h2o installation, so use R `xgboost`
        ## Note: results may be different, and run-time is substantially longer
        
        message("  >> fitting an XGBoost (using xgboost)...")
        message("     Note: this will take a while, no progress bar will be visible")
        
        # Create train/test design matrices (model.matrix creates dummies from factors)
        train_matrix <- model.matrix(~ . - 1, data = dfp[, x]) 
        test_matrix <- model.matrix(~ . - 1, data = dfe[, x]) # Create a design matrix
        
        # Convert design matrices to `DMatrix` objects for use in XGBoost
        dtrain <- xgb.DMatrix(data = train_matrix, label = as.numeric(as.character(dfp[[y]])))
        dtest <- xgb.DMatrix(data = test_matrix)
        
        # Set our desired XGBoost parameters (equivalent to params for h2o implementation above)
        xgb_params <- list(
          objective = "binary:logistic",
          max_depth = 4,
          eta = 0.01,  # Shrinkage
          eval_metric = "logloss",
          nthread = ncores  # Note: this does not appear to do anything
        )
        
        # Train to get cross validation predictions
        set.seed(754+fold)
        cv_results <- xgb.cv(
          params = xgb_params,
          data = dtrain,
          nrounds = 1000,
          nfold = nfolds,  # 
          prediction = TRUE,  # Store out-of-fold predictions
          verbose = FALSE
        )
        
        # Train a final model on the full dataset to make test predictions
        my_xgb <- xgb.train(
          params = xgb_params,
          data = dtrain,
          nrounds = 1000,
          verbose = FALSE
        )
        
        # Get the cv predictions and the validation set predictions
        tr <- getPreds(my_gbm2, hdfe)
        tr <- getPreds(my_gbm3, hdfe, tr)
        tr <- getPreds(my_lasso, hdfe, tr)
        tr <- getPreds(my_ridge, hdfe, tr)
        tr <- getPreds(my_rf, hdfe, tr)
        tr <- getPreds(my_xgb, dtest, tr, model_cv_only = cv_results)
        
        tr[["cv.preds"]] <- cbind(tr[["cv.preds"]], dfp[,y,drop = FALSE])
        tr[["val.preds"]] <- cbind(tr[["val.preds"]], dfe[,y,drop = FALSE])
        
        message("  >> fitting an stacked ensemble...")
        
        # Train the ensemble
        temp_ens <- h2o.glm(x = colnames(tr[["cv.preds"]] )[1:ncol(tr[["cv.preds"]] )-1],
                            y = y,
                            training_frame = as.h2o(tr[["cv.preds"]]),
                            model_id = paste0("Exp_ensemble_", fold),
                            nfolds = nfolds,
                            fold_assignment = "Modulo",
                            keep_cross_validation_predictions = TRUE,
                            family = "binomial", 
                            non_negative = TRUE,
                            seed = 1)
        
        # Create panel-free predictions
        dfe_preds <- h2o.predict(temp_ens, newdata = as.h2o(tr[["val.preds"]]))[3]
        dfe_preds <- as.data.frame(dfe_preds)[, 1]
        pf[[rando.model]][pf$fold == fold] <- dfe_preds 
        
      }
      
      # Save accumulated predictions for this fold (and previous folds)
      save(pf, file=new.file)
      
      # Collect any warnings
      fold.warnings <- unique(c(fold.warnings, paste0("fold ", fold, ": ", names(warnings()))))
      print(fold.warnings)
      
      # Remove unneeded objects and shut down the h2o server
      rm(list = setdiff(ls(), keep.obj))
      h2o.removeAll()
      h2o.shutdown(prompt=FALSE)
      
      # Reset the error counter after completing iteration with no error
      error.counter <- 0
      
      # h2o server seems to have problems if you shut down and restart too quickly
      Sys.sleep(30)
      
    },
    
    error = function(e) {
      
      message("Error: ", e$message)
      error.counter <- error.counter + 1
      
      # Remove unneeded objects and shut down the h2o server
      rm(list = setdiff(ls(), keep.obj))
      h2o.removeAll()
      h2o.shutdown(prompt=FALSE)
      
      # h2o server seems to have problems if you shut down and restart too quickly
      Sys.sleep(120) # shut down for 2 minutes
      
    })
  }
}

fold.warnings <- fold.warnings[!grepl("Please download and install the latest version from",fold.warnings)]
fold.warnings <- fold.warnings[!grepl("^fold [0-9]+[:] *$",fold.warnings)]
if(length(fold.warnings) > 0){
  readr::write_file(paste0(fold.warnings,collapse = "\n"), file = "Data/warnings3.log")
}

# If we have successfully fitted algorithms for all folds, then do final steps
if(length(GetMissingFolds(pf, "rando.base", any.NA = FALSE)) + length(GetMissingFolds(pf, "rando.full", any.NA = FALSE)) == 0){
  # Save the final results as RData and csv
  save(pf, file=new.file)
  readr::write_csv(pf, file=gsub(".rdata", ".csv", new.file)) # also save copy as csv
}